script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js" $(document).ready(function() {
$('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
// onClick function for all plots (img's)
$('img:not(.zoomImg)').click(function() {
$('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
$('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
});
// onClick function for zoomImg
$('img.zoomImg').click(function() {
$('.zoomDiv').css({opacity: '0', width: '0%'});
});
});
<script src="hideOutput.js"></script> $(document).ready(function() {
$chunks = $('.fold');
$chunks.each(function () { // add button to source code chunks
if ( $(this).hasClass('s') ) {
$('pre.r', this).prepend("<div class=\"showopt\">Show Source</div><br style=\"line-height:22px;\"/>");
$('pre.r', this).children('code').attr('class', 'folded');
} // add button to output chunks
if ( $(this).hasClass('o') ) {
$('pre:not(.r)', this).has('code').prepend("<div class=\"showopt\">Show Output</div><br style=\"line-height:22px;\"/>");
$('pre:not(.r)', this).children('code:not(r)').addClass('folded'); // add button to plots
$(this).find('img').wrap('<pre class=\"plot\"></pre>');
$('pre.plot', this).prepend("<div class=\"showopt\">Show Plot</div><br style=\"line-height:22px;\"/>");
$('pre.plot', this).children('img').addClass('folded');
}
}); // hide all chunks when document is loaded
$('.folded').css('display', 'none') // function to toggle the visibility
$('.showopt').click(function() {
var label = $(this).html();
if (label.indexOf("Show") >= 0) {
$(this).html(label.replace("Show", "Hide"));
} else {
$(this).html(label.replace("Hide", "Show"));
}
$(this).siblings('code, img').slideToggle('fast', 'swing');
});
}); Generate flowcharts of joins
Kappa y MLs for classification
### 1.- What is Base_fiscalia_v9, why it has 49,970 IDs & 174,961 rows : paste0("Patients in PO records with unique combination of ID, RUC, end type and date of comission of the offense and offense ", a_tab11_lab_aft_d1 (p= 5,470; RUCs= 5,993; n= 8,291) --> should be only people that commit offenses, not victims)
### 2.- why the difference of 49,970- 49,252????
### 3.-
Base_fiscalia_v10b_dic_2022<-
sqldf("SELECT *
FROM CONS_C1_df_dup_SEP_2020_22_d AS x
LEFT JOIN (SELECT *
FROM Base_fiscalia_v9
) AS y
ON x.hash_key == y.id AND
x.dup = 1
") #2022-11-25 added dup // #changed the direction to past events, where age at discharge is greater than the age of commission //FEB 2023: WHERE ano_bd_first='2015' OR ano_bd_first='2016' OR ano_bd_first='2017' OR ano_bd_first='2018' OR ano_bd_first='2019'
message(
paste0("First, for each baseline admission of patients in C1 (", format(length(unique(CONS_C1_df_dup_SEP_2020_22_d$hash_key)), big.mark=","), "; n= ",format(nrow(CONS_C1_df_dup_SEP_2020_22_d), big.mark=","),"), we found the PO records in which is found as the offender (p=", format(length(unique(Base_fiscalia_v9$id)), big.mark=","), "; n= ",format(nrow(Base_fiscalia_v9), big.mark=","),") (p= ", format(length(unique(Base_fiscalia_v10b_dic_2022$hash_key)), big.mark=","), "; n= ",format(nrow(Base_fiscalia_v10b_dic_2022), big.mark=","),"). We did not exclude patients in C1 that were registered in databases before 2015 in this step (Feb 2023)")
)
invisible("49,970 patients")
Base_fiscalia_v11b_dic_2022<-
Base_fiscalia_v10b_dic_2022 %>%
#discrepancies in names of variables
janitor::clean_names() %>% #janitor::tabyl(!is.na(dob_imp_num))
#previously recoded,
dplyr::select(-dateofbirth_imp, -country, -victim, -id_victim, -crime_code_c , -reg_c, -end_type_2c, -cod_comunadelito, -cod_lugarocurrencia, -sex_imp, -region_delito, -filter, -id)%>%
plyr::rename(c("dateofbirth_imp_2"="dateofbirth_imp")) %>%
dplyr::ungroup() %>%
#selected the first row with distinct information regarding patient ID, case ID, crime code.
dplyr::group_by(hash_key, dateofbirth_imp, caseid, crime_code_group_rec_prof) %>%
dplyr::slice(1) %>%
dplyr::ungroup() %>%
dplyr::group_by(hash_key,dateofbirth_imp) %>%
summarise(n_off_acq= ifelse(sum(crime_code_group_rec_prof=="Acquisitive", na.rm=T)>0, 1,0), n_off_vio= ifelse(sum(crime_code_group_rec_prof=="Violent", na.rm=T)>0, 1,0), n_off_sud= ifelse(sum(crime_code_group_rec_prof== "Substance-related", na.rm=T)>0, 1,0), n_off_oth= ifelse(sum(crime_code_group_rec_prof== "Other", na.rm=T)>0, 1,0)) %>%
dplyr::ungroup() %>%
dplyr::mutate(n_off= rowSums(select(., starts_with("n_"))))
message(paste0( "Afterwards, we counted the offenses by type of offense (Acquisitive, violent, substance-related and other) and the total by each user before baseline admisson (p= ", format(length(unique(Base_fiscalia_v11b_dic_2022$hash_key)), big.mark=","), "; cases with records in P.O.= ",format(as.numeric(table(is.na(Base_fiscalia_v10b_dic_2022$dateofbirth_imp))[[2]]), big.mark=","),")." ))
#Registrar hurtos, robos, violencia intrafamiliar y otras acciones cometidas en las últimas 4 semanas
#Violencia Intrafamiliar (Maltrato físico o psicológico)
CONS_TOP_2022<-
# 107307
CONS_TOP%>%
# dplyr::left_join(subset(dplyr::mutate(dplyr::group_by(Base_fiscalia_v9, id), hash_rn=row_number())%>% ungroup(), hash_rn==1), by= c("HASH_KEY" = "id"))%>%
dplyr::left_join(subset(dplyr::mutate(dplyr::group_by(Base_fiscalia_v11b_dic_2022, hash_key), hash_rn=row_number())%>% ungroup(), hash_rn==1), by= c("HASH_KEY" = "hash_key"))%>%
dplyr::mutate(fech_ap_top_num= as.numeric(as.Date(str_sub(as.character(lubridate::parse_date_time(Fecha.Aplicación.TOP, c("%Y-%m-%d"),exact=T)),1,10))))%>% #No parse failures
dplyr::select(HASH_KEY, fech_ap_top_num, Fecha.Aplicación.TOP, dateofbirth_imp, Hurto, Robo, Venta.Drogas, Riña, Total.VIF, Otro) %>%
dplyr::filter(!is.na(HASH_KEY)) %>%
dplyr::mutate_at(vars("Hurto", "Robo", "Venta.Drogas", "Riña", "Otro"), ~ifelse(.=="S",1,0)) %>%
dplyr::mutate(Total.VIF= ifelse(Total.VIF>0,1,0))%>%
dplyr::mutate(tot_off_top = base::rowSums(dplyr::select(.,c(Hurto, Robo, Venta.Drogas, Riña, Total.VIF, Otro)), na.rm = T)) %>%
dplyr::mutate(dateofbirth_imp_num= as.numeric(dateofbirth_imp),
fech_ap_top= lubridate::parse_date_time(Fecha.Aplicación.TOP, c("%Y-%m-%d"),exact=T),
edad_a_ap_top_num= lubridate::time_length(lubridate::interval(dateofbirth_imp, fech_ap_top),unit="years"),
edad_b_ap_top_num= (fech_ap_top_num-dateofbirth_imp_num)/365.25,
edad_a_ap_top_num_lim= edad_a_ap_top_num-(1/12),
edad_b_ap_top_num_lim= edad_b_ap_top_num-(1/12)) %>%
dplyr::select(-dateofbirth_imp, -dateofbirth_imp_num) %>%
dplyr::filter(!is.na(edad_a_ap_top_num)) %>%
dplyr::group_by(HASH_KEY, edad_a_ap_top_num) %>%
dplyr::slice(1) %>%
dplyr::ungroup()message(paste0("Then, we standardized the varibles in TOP (e.g., dates format) and counted self-reported offenses, and also we standardized data with PO records (dates of birth) to get the age when the patient have answered TOP and the age previous [ (1/12)*365.25 ]=",round((1/12)*365.25,0)," days before TOP application. This resulted in ", format(nrow(CONS_TOP_2022), big.mark=","), " rows (combination of different application dates and patients) of ", format(length(unique(CONS_TOP_2022$HASH_KEY)), big.mark=","), " patients."))
Base_fiscalia_v9_dic_2022<-
Base_fiscalia_v9 %>%
dplyr::group_by(id, caseid, end_type, fec_comision_simple, crime_code_c) %>%
dplyr::slice(1) %>%
dplyr::ungroup()
message(
paste0("We reduced PO records to those that do not have the same sentence for the same offense committed at the same date in the same process (case ID/RUC) (p= ", format(length(unique(Base_fiscalia_v9_dic_2022$id)), big.mark=","), "; n= ",format(nrow(Base_fiscalia_v9_dic_2022), big.mark=","),")")
)
#busco en la base de datos de fiscalía, algún delito que haya cometido en el transcurso
#del último mes anterior a la aplicación del TOP
Base_fiscalia_v13c_dic_2022<-
sqldf("SELECT *
FROM CONS_TOP_2022 AS x
LEFT JOIN (SELECT *
FROM Base_fiscalia_v9_dic_2022
) AS y
ON x.HASH_KEY == y.id AND
x.edad_a_ap_top_num > y.age_offending_imp AND
x.edad_a_ap_top_num_lim < y.age_offending_imp") #2022-11-25 added dup // #changed the direction to past events, where age at discharge is greater than the age of commission
message(
paste0("Of the total of TOP applications (p= ", format(length(unique(CONS_TOP_2022$HASH_KEY)), big.mark=","), "; n= ",format(nrow(CONS_TOP_2022), big.mark=","),"), we looked for offenses comitted in the period of thirty days before (`x.edad_a_ap_top_num_lim`) the application of TOP (p= ", format(length(unique(Base_fiscalia_v13c_dic_2022$HASH_KEY)), big.mark=","), "; n= ",format(nrow(Base_fiscalia_v13c_dic_2022), big.mark=","),"). If this database contains more records, is beacause there may be some patients that had more than one record as a result of more than one application of TOP. This will be resolved in the next step")
)
#Luego, contar por cada combinación de HASH y fecha de aplicación, el número de delitos segun familai de delito
#crime_code_group
Base_fiscalia_v13c_dic_2022_2<-
Base_fiscalia_v13c_dic_2022 %>%
dplyr::select(tidyr::any_of(c("HASH_KEY", "fech_ap_top_num", "Fecha.Aplicación.TOP", "Hurto", "Robo", "Venta.Drogas", "Riña",
"Total.VIF", "Otro", "tot_off_top", "fech_ap_top", "edad_a_ap_top_num", "edad_b_ap_top_num",
"edad_a_ap_top_num_lim", "edad_b_ap_top_num_lim", "caseid", "end_type", "fec_comision_simple",
"crime_code_c", "crime_code_group_rec", "crime_code_group_rec_prof", "age_offending_imp")))
#HASH_KEY
#fech_ap_top_num
#Fecha.Aplicación.TOP
#Hurto
#Robo
#Venta.Drogas
#Riña
#Total.VIF
#Otro
#tot_off_top
#fech_ap_top
#edad_a_ap_top_num
#edad_b_ap_top_num
#edad_a_ap_top_num_lim
#edad_b_ap_top_num_lim
#caseid
#end_type
#fec_comision_simple
#crime_code_c
#crime_code_group_rec_prof
#crime_code_group_rec
#age_offending_imp
invisible("# 131 rows added, why?")
message("Patients than had more than one combination of application date and ID. That means, they answered TOP more than one time, and in at least 2 of these times they had previous records: "); Base_fiscalia_v13c_dic_2022_2 %>%
dplyr::count(HASH_KEY, fech_ap_top_num) %>%
dplyr::filter(n>1) %>%
dplyr::summarise(sum=sum(n), n=n(), tot=sum-n)
sum n tot
1 245 114 131
Base_fiscalia_v13c_dic_2022_3<-
Base_fiscalia_v13c_dic_2022_2 %>%
dplyr::ungroup() %>%
dplyr::mutate(vio=dplyr::if_else(crime_code_group_rec_prof=="Violent",1,0,0),
acq=dplyr::if_else(crime_code_group_rec_prof=="Acquisitive",1,0,0),
sud=dplyr::if_else(crime_code_group_rec_prof=="Substance-related",1,0,0),
oth=dplyr::if_else(crime_code_group_rec_prof=="Other",1,0,0)) %>%
dplyr::mutate(tot= rowSums(select(., c("vio","acq","sud","oth")), na.rm=T)) %>%
dplyr::mutate(acq_top= rowSums(select(., c("Hurto","Robo")), na.rm=T)) %>%
dplyr::mutate(sud_top= rowSums(select(., c("Venta.Drogas")), na.rm=T)) %>%
dplyr::mutate(vio_top= rowSums(select(., c("Riña", "Total.VIF")), na.rm=T)) %>%
dplyr::mutate(oth_top= rowSums(select(., c("Otro")), na.rm=T)) %>%
dplyr::group_by(HASH_KEY, fech_ap_top_num) %>%
dplyr::mutate(vio=sum(vio,na.rm=T), acq=sum(acq,na.rm=T), sud=sum(sud,na.rm=T),
oth=sum(oth,na.rm=T), tot=sum(tot,na.rm=T), n=n()) %>%
dplyr::slice(1) %>%
dplyr::ungroup()
#Los que tienen todo pelado en caseid es porque no tienen fecha de comisión en los últimos 30 días
message(
paste0("For each combination of TOP application, count the number of previous offenses in PO records (those with 0s are people that did not committed an offense recorded by PO in the period previous to the application of TOP) (p= ", format(length(unique(Base_fiscalia_v13c_dic_2022_3$HASH_KEY)), big.mark=","), "; n= ",format(nrow(Base_fiscalia_v13c_dic_2022_3), big.mark=","),")")
)
We compared the records of TOP (Hurto,Robo, Venta.Drogas, Riña, Total.VIF, Otro, & tot_off_top) with the records of PO (vio, acq, sud, oth & tot). We grouped TOPs self-reports of offenses related to burglary or robbery into acquisitive (acq_top), the reports of drug-selling into substance related (sud_top) and reports of fights and domestic violence into violent (vio_top) and Other into other (oth_top).
cat("Recode to binary measures")
Recode to binary measures
Base_fiscalia_v13c_dic_2022_3$tot_off_top_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$tot_off_top>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$tot_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$tot>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$vio_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$vio>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$acq_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$acq>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$sud_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$sud>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$oth_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$oth>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$acq_top_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$acq_top>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$sud_top_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$sud_top>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$vio_top_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$vio_top>=1,1,0)
Base_fiscalia_v13c_dic_2022_3$oth_top_bin<-
ifelse(Base_fiscalia_v13c_dic_2022_3$oth_top>=1,1,0)
cat("::: {.panel}", "\n", "[Total]{.panel-name}", "\n")
cat("**Total records of offenses vs. TOP reported offenses**")
paste0("**Records from PO**")
paste0("**Reports from TOP**")
pol_p<-
polychor(Base_fiscalia_v13c_dic_2022_3$tot_off_top, Base_fiscalia_v13c_dic_2022_3$tot, std.err=T, ML=T)
pol_p2<-
psych::polychoric(table(Base_fiscalia_v13c_dic_2022_3$tot_off_top, Base_fiscalia_v13c_dic_2022_3$tot))
#rho : The (matrix) of tetrachoric/polychoric/biserial correlations
#tau : The normal equivalent of the cutpoints
#for ordinal variables, the same idea holds...but now we have more than one τ-cut to subdivide the normal distribution into regions associated with the different values of the ordinal measure..
# polychoric is appropriate when the manifest ordinal variables came from categorizing latent normal variables & not otherwise. (In practice, it's more like when you are willing to assume thi
#We start with a relative frequency pattern for our contingency table of ordinal data in the lower-left, and we find the optimal solution for correlation and tau-cuts to get us back to a bivariate normal distribution that would have comparable relative joint-frequencies.
cat("**Polychoric correlation: Total records of offenses vs. TOP reported offenses**")
message(
capture.output(print(pol_p))[c(2,3)]
)
cat("**Overdispersion**")
#The variance is not much greater than the mean, which suggests that we will have over-dispersion in the model.
# obtained with a poisson regression
cat("**Recode**")
#For more than two classes, these results are calculated comparing each factor level to the remaining levels (i.e. a "one versus all" approach).
# accuracy should be higher than No information rate (naive classifier) in order to be model significant.
#No information rate tests whether our classifier does better than random assignment
# A hypothesis test is also computed to evaluate whether the overall accuracy rate is greater than the rate of the largest class. P-Value [Acc > NIR]
#Null Error Rate: This is how often you would be wrong if you always predicted the majority class.
# but sometimes the best classifier for a particular application will sometimes have a higher error rate than the null error rate, as in the accuracy paradox
#if the incidence of category A is dominant, being found in 99% of cases, then predicting that every case is category A will have an accuracy of 99%
#Kappa statistics is a measure of how the classification results compare to values assigned by chance. P-value mcnemar, can produce NA values with sparse tables)
pander::pander(caret::confusionMatrix(factor(Base_fiscalia_v13c_dic_2022_3$tot_off_top_rec), factor(Base_fiscalia_v13c_dic_2022_3$tot)))
positive:
table:
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | 40984 | 479 | 35 | 2 | 1 |
| 1 | 7624 | 309 | 32 | 5 | 2 |
| 2 | 2442 | 146 | 21 | 4 | 0 |
| 3 | 1140 | 68 | 7 | 0 | 0 |
| 4 | 819 | 34 | 5 | 0 | 0 |
overall:
| Accuracy | Kappa | AccuracyLower | AccuracyUpper | AccuracyNull |
|---|---|---|---|---|
| 0.7628 | 0.0401 | 0.7592 | 0.7664 | 0.9788 |
| AccuracyPValue | McnemarPValue |
|---|---|
| 1 | NA |
byClass:
| Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | |
|---|---|---|---|---|
| Class: 0 | 0.7732 | 0.5504 | 0.9875 | 0.05001 |
| Class: 1 | 0.2983 | 0.8557 | 0.03876 | 0.9843 |
| Class: 2 | 0.21 | 0.9521 | 0.008037 | 0.9985 |
| Class: 3 | 0 | 0.9776 | 0 | 0.9998 |
| Class: 4 | 0 | 0.9842 | 0 | 0.9999 |
| Precision | Recall | F1 | Prevalence | Detection Rate | |
|---|---|---|---|---|---|
| Class: 0 | 0.9875 | 0.7732 | 0.8673 | 0.9788 | 0.7567 |
| Class: 1 | 0.03876 | 0.2983 | 0.06861 | 0.01913 | 0.005705 |
| Class: 2 | 0.008037 | 0.21 | 0.01548 | 0.001846 | 0.0003877 |
| Class: 3 | 0 | 0 | NA | 0.0002031 | 0 |
| Class: 4 | 0 | 0 | NA | 5.539e-05 | 0 |
| Detection Prevalence | Balanced Accuracy | |
|---|---|---|
| Class: 0 | 0.7663 | 0.6618 |
| Class: 1 | 0.1472 | 0.577 |
| Class: 2 | 0.04825 | 0.581 |
| Class: 3 | 0.02243 | 0.4888 |
| Class: 4 | 0.01584 | 0.4921 |
mode: sens_spec
dots:
cat("**Kappa with ordinal weights**")
pander::pander(irrCAC::kappa2.table(table(ordered(Base_fiscalia_v13c_dic_2022_3$tot_off_top_rec), ordered(Base_fiscalia_v13c_dic_2022_3$tot)),weights = irrCAC::ordinal.weights(0:4)))
| coeff.name | coeff.val | coeff.se | coeff.ci | coeff.pval |
|---|---|---|---|---|
| Cohen’s Kappa | 0.0349 | 0.002059 | (0.031,0.039) | 0e+00 |
#really low
#To choose between linear and quadratic weights, ask yourself if the difference between being off by 1 vs. 2 categories is the same as the difference between being off by 2 vs. 3 categories. With linear weights, the penalty is always the same (e.g., 0.33 credit is subtracted for each additional category). However, this is not the case for quadratic weights, where penalties begin mild then grow harsher.
#CONS: Depends on the marginal distribution (prevalence) of the categories
cat("**Plot**")
ggstatsplot::ggscatterstats(tot_off_top, tot, data=Base_fiscalia_v13c_dic_2022_3,
type="nonparametric", point.width.jitter=.9, point.height.jitter=.9,
xlab="Number of TOP offenses", ylab="Number of PO offenses",
title="Relationship between Total counts of offenses from PO and TOP")
cat(":::", "\n")
cat("::: {.panel}", "\n", "[Acquisitive]{.panel-name}", "\n")
cat("**Total records of offenses vs. TOP reported offenses**")
paste0("**Records from PO**")
paste0("**Reports from TOP**")
pol_p_acq<-
polychor(Base_fiscalia_v13c_dic_2022_3$acq_top, Base_fiscalia_v13c_dic_2022_3$acq, std.err=T, ML=T)
pol_p2_acq<-
psych::polychoric(table(Base_fiscalia_v13c_dic_2022_3$acq_top, Base_fiscalia_v13c_dic_2022_3$acq))
cat("**Polychoric correlation: Total records of offenses vs. TOP reported offenses**")
message(capture.output(print(pol_p_acq))[c(2,3)])
cat("**Recode**")
pander::pander(tidy(caret::confusionMatrix(factor(Base_fiscalia_v13c_dic_2022_3$acq_top), factor(Base_fiscalia_v13c_dic_2022_3$acq_rec))))
| term | class | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|---|
| accuracy | NA | 0.916 | 0.9137 | 0.9184 | 1 |
| kappa | NA | 0.04983 | NA | NA | NA |
| mcnemar | NA | NA | NA | NA | 0 |
| sensitivity | 0 | 0.9213 | NA | NA | NA |
| specificity | 0 | 0.441 | NA | NA | NA |
| pos_pred_value | 0 | 0.9952 | NA | NA | NA |
| neg_pred_value | 0 | 0.04234 | NA | NA | NA |
| precision | 0 | 0.9952 | NA | NA | NA |
| recall | 0 | 0.9213 | NA | NA | NA |
| f1 | 0 | 0.9568 | NA | NA | NA |
| prevalence | 0 | 0.9922 | NA | NA | NA |
| detection_rate | 0 | 0.9141 | NA | NA | NA |
| detection_prevalence | 0 | 0.9184 | NA | NA | NA |
| balanced_accuracy | 0 | 0.6812 | NA | NA | NA |
| sensitivity | 1 | 0.2519 | NA | NA | NA |
| specificity | 1 | 0.951 | NA | NA | NA |
| pos_pred_value | 1 | 0.03692 | NA | NA | NA |
| neg_pred_value | 1 | 0.9942 | NA | NA | NA |
| precision | 1 | 0.03692 | NA | NA | NA |
| recall | 1 | 0.2519 | NA | NA | NA |
| f1 | 1 | 0.06439 | NA | NA | NA |
| prevalence | 1 | 0.007404 | NA | NA | NA |
| detection_rate | 1 | 0.001865 | NA | NA | NA |
| detection_prevalence | 1 | 0.05052 | NA | NA | NA |
| balanced_accuracy | 1 | 0.6014 | NA | NA | NA |
| sensitivity | 2 | 0.2609 | NA | NA | NA |
| specificity | 2 | 0.9691 | NA | NA | NA |
| pos_pred_value | 2 | 0.003569 | NA | NA | NA |
| neg_pred_value | 2 | 0.9997 | NA | NA | NA |
| precision | 2 | 0.003569 | NA | NA | NA |
| recall | 2 | 0.2609 | NA | NA | NA |
| f1 | 2 | 0.007042 | NA | NA | NA |
| prevalence | 2 | 0.0004247 | NA | NA | NA |
| detection_rate | 2 | 0.0001108 | NA | NA | NA |
| detection_prevalence | 2 | 0.03104 | NA | NA | NA |
| balanced_accuracy | 2 | 0.615 | NA | NA | NA |
cat("**Kappa with ordinal weights**")
pander::pander(irrCAC::kappa2.table(table(ordered(Base_fiscalia_v13c_dic_2022_3$acq_top), ordered(Base_fiscalia_v13c_dic_2022_3$acq_rec)),weights = irrCAC::ordinal.weights(0:2)))
| coeff.name | coeff.val | coeff.se | coeff.ci | coeff.pval |
|---|---|---|---|---|
| Cohen’s Kappa | 0.04807 | 0.004008 | (0.04,0.056) | 0e+00 |
cat("**Plot**")
ggstatsplot::ggscatterstats(acq_top, acq, data=Base_fiscalia_v13c_dic_2022_3,
type="nonparametric", point.width.jitter=.9, point.height.jitter=.9,
xlab="Number of TOP offenses", ylab="Number of PO offenses",
title="Relationship between counts of Acquisitive offenses from PO and TOP")
cat(":::", "\n")
cat("::: {.panel}", "\n", "[SUD]{.panel-name}", "\n")
cat("**Total records of offenses vs. TOP reported offenses**")
paste0("**Records from PO**")
paste0("**Reports from TOP**")
pol_p_sud<-
polychor(Base_fiscalia_v13c_dic_2022_3$sud_top, Base_fiscalia_v13c_dic_2022_3$sud, std.err=T, ML=T)
pol_p2_sud<-
psych::polychoric(table(Base_fiscalia_v13c_dic_2022_3$sud_top, Base_fiscalia_v13c_dic_2022_3$sud))
cat("**Polychoric correlation: Total records of offenses vs. TOP reported offenses**")
message(capture.output(print(pol_p_sud))[c(2,3)])
cat("**Recode**")
pander::pander(tidy(
caret::confusionMatrix(factor(Base_fiscalia_v13c_dic_2022_3$sud_top), factor(Base_fiscalia_v13c_dic_2022_3$sud_rec))
))
| term | class | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|---|
| accuracy | NA | 0.9682 | 0.9667 | 0.9697 | 1 |
| kappa | NA | 0.01342 | NA | NA | NA |
| mcnemar | NA | NA | NA | NA | 2.423e-246 |
| sensitivity | 0 | 0.9712 | NA | NA | NA |
| specificity | 0 | 0.09341 | NA | NA | NA |
| pos_pred_value | 0 | 0.9969 | NA | NA | NA |
| neg_pred_value | 0 | 0.0108 | NA | NA | NA |
| precision | 0 | 0.9969 | NA | NA | NA |
| recall | 0 | 0.9712 | NA | NA | NA |
| f1 | 0 | 0.9838 | NA | NA | NA |
| prevalence | 0 | 0.9966 | NA | NA | NA |
| detection_rate | 0 | 0.9679 | NA | NA | NA |
| detection_prevalence | 0 | 0.9709 | NA | NA | NA |
| balanced_accuracy | 0 | 0.5323 | NA | NA | NA |
cat(":::", "\n")
cat("::: {.panel}", "\n", "[Violent]{.panel-name}", "\n")
cat("**Total records of offenses vs. TOP reported offenses**")
paste0("**Records from PO**")
paste0("**Reports from TOP**")
pol_p_vio<-
polychor(Base_fiscalia_v13c_dic_2022_3$vio_top, Base_fiscalia_v13c_dic_2022_3$vio, std.err=T, ML=T)
pol_p2_vio<-
psych::polychoric(table(Base_fiscalia_v13c_dic_2022_3$vio_top, Base_fiscalia_v13c_dic_2022_3$vio))
cat("**Polychoric correlation: Total records of offenses vs. TOP reported offenses**")
message(capture.output(print(pol_p_vio))[c(2,3)])
cat("**Recode**")
pander::pander(tidy(
caret::confusionMatrix(factor(Base_fiscalia_v13c_dic_2022_3$vio_top), factor(Base_fiscalia_v13c_dic_2022_3$vio_rec))
))
| term | class | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|---|
| accuracy | NA | 0.8235 | 0.8203 | 0.8267 | 1 |
| kappa | NA | 0.01975 | NA | NA | NA |
| mcnemar | NA | NA | NA | NA | 0 |
| sensitivity | 0 | 0.8257 | NA | NA | NA |
| specificity | 0 | 0.564 | NA | NA | NA |
| pos_pred_value | 0 | 0.9972 | NA | NA | NA |
| neg_pred_value | 0 | 0.01706 | NA | NA | NA |
| precision | 0 | 0.9972 | NA | NA | NA |
| recall | 0 | 0.8257 | NA | NA | NA |
| f1 | 0 | 0.9034 | NA | NA | NA |
| prevalence | 0 | 0.9947 | NA | NA | NA |
| detection_rate | 0 | 0.8213 | NA | NA | NA |
| detection_prevalence | 0 | 0.8236 | NA | NA | NA |
| balanced_accuracy | 0 | 0.6949 | NA | NA | NA |
| sensitivity | 1 | 0.4326 | NA | NA | NA |
| specificity | 1 | 0.8553 | NA | NA | NA |
| pos_pred_value | 1 | 0.01541 | NA | NA | NA |
| neg_pred_value | 1 | 0.9965 | NA | NA | NA |
| precision | 1 | 0.01541 | NA | NA | NA |
| recall | 1 | 0.4326 | NA | NA | NA |
| f1 | 1 | 0.02976 | NA | NA | NA |
| prevalence | 1 | 0.005207 | NA | NA | NA |
| detection_rate | 1 | 0.002253 | NA | NA | NA |
| detection_prevalence | 1 | 0.1462 | NA | NA | NA |
| balanced_accuracy | 1 | 0.644 | NA | NA | NA |
| sensitivity | 2 | 0 | NA | NA | NA |
| specificity | 2 | 0.9698 | NA | NA | NA |
| pos_pred_value | 2 | 0 | NA | NA | NA |
| neg_pred_value | 2 | 0.9999 | NA | NA | NA |
| precision | 2 | 0 | NA | NA | NA |
| recall | 2 | 0 | NA | NA | NA |
| f1 | 2 | NA | NA | NA | NA |
| prevalence | 2 | 0.0001292 | NA | NA | NA |
| detection_rate | 2 | 0 | NA | NA | NA |
| detection_prevalence | 2 | 0.03023 | NA | NA | NA |
| balanced_accuracy | 2 | 0.4849 | NA | NA | NA |
cat("**Kappa with ordinal weights**")
pander::pander(
irrCAC::kappa2.table(table(ordered(Base_fiscalia_v13c_dic_2022_3$vio_top), ordered(Base_fiscalia_v13c_dic_2022_3$vio_rec)),weights = irrCAC::ordinal.weights(0:2))
)
| coeff.name | coeff.val | coeff.se | coeff.ci | coeff.pval |
|---|---|---|---|---|
| Cohen’s Kappa | 0.01956 | 0.00187 | (0.016,0.023) | 0e+00 |
cat("**Plot**")
ggstatsplot::ggscatterstats(vio_top, vio, data=Base_fiscalia_v13c_dic_2022_3,
type="nonparametric", point.width.jitter=.9, point.height.jitter=.9,
xlab="Number of TOP offenses", ylab="Number of PO offenses",
title="Relationship between counts of violent offenses from PO and TOP")
cat(":::", "\n")
cat("::: {.panel}", "\n", "[Other]{.panel-name}", "\n")
cat("**Total records of offenses vs. TOP reported offenses**")
paste0("**Records from PO**")
paste0("**Reports from TOP**")
pol_p_oth<-
polychor(Base_fiscalia_v13c_dic_2022_3$oth_top, Base_fiscalia_v13c_dic_2022_3$oth, std.err=T, ML=T)
pol_p2_oth<-
psych::polychoric(table(Base_fiscalia_v13c_dic_2022_3$oth_top, Base_fiscalia_v13c_dic_2022_3$oth))
cat("**Polychoric correlation: Total records of offenses vs. TOP reported offenses**")
message(capture.output(print(pol_p_oth))[c(2,3)])
cat("**Recode**")
pander::pander(tidy(
caret::confusionMatrix(factor(Base_fiscalia_v13c_dic_2022_3$oth_top), factor(Base_fiscalia_v13c_dic_2022_3$oth_rec))
))
| term | class | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|---|
| accuracy | NA | 0.9648 | 0.9632 | 0.9663 | 1 |
| kappa | NA | 0.01875 | NA | NA | NA |
| mcnemar | NA | NA | NA | NA | 1.654e-199 |
| sensitivity | 0 | 0.9701 | NA | NA | NA |
| specificity | 0 | 0.08669 | NA | NA | NA |
| pos_pred_value | 0 | 0.9944 | NA | NA | NA |
| neg_pred_value | 0 | 0.01707 | NA | NA | NA |
| precision | 0 | 0.9944 | NA | NA | NA |
| recall | 0 | 0.9701 | NA | NA | NA |
| f1 | 0 | 0.9821 | NA | NA | NA |
| prevalence | 0 | 0.994 | NA | NA | NA |
| detection_rate | 0 | 0.9643 | NA | NA | NA |
| detection_prevalence | 0 | 0.9697 | NA | NA | NA |
| balanced_accuracy | 0 | 0.5284 | NA | NA | NA |
cat(":::", "\n")
#
cat("Manually construct measure tables")
#http://rstudio-pubs-static.s3.amazonaws.com/370944_96c386c03ac54ef3bec4535d49e92890.html
confusion_table = table(Base_fiscalia_v13c_dic_2022_3$tot_off_top_bin, Base_fiscalia_v13c_dic_2022_3$tot_bin)
confusion_table[1,1] = 'TN'
confusion_table[1,2] = 'FN'
confusion_table[2,1] = 'FP'
confusion_table[2,2] = 'TP'
get_accuracy <- function(df, predicted, actual){
confusion_table = table(df[[predicted]],df[[actual]])
TP = confusion_table[2,2]
TN = confusion_table[1,1]
FN = confusion_table[1,2]
FP = confusion_table[2,1]
accuracy = round((TP + TN) / sum(TP,FP,TN,FN), 2)
return(accuracy)
}
get_classification_error_rate <- function(df, predicted, actual){
confusion_table = table(df[[predicted]],df[[actual]])
TP = confusion_table[2,2]
TN = confusion_table[1,1]
FN = confusion_table[1,2]
FP = confusion_table[2,1]
classification_error_rate = round((FP + FN) / sum(TP,FP,TN,FN),2)
return(classification_error_rate)
}
get_precision <- function(df, predicted, actual){
confusion_table = table(df[[predicted]],df[[actual]])
TP = confusion_table[2,2]
TN = confusion_table[1,1]
FN = confusion_table[1,2]
FP = confusion_table[2,1]
precision = round(TP / (TP + FP), 2)
return(precision)
}
get_sensitivity <- function(df, predicted, actual){
confusion_table = table(df[[predicted]],df[[actual]])
TP = confusion_table[2,2]
TN = confusion_table[1,1]
FN = confusion_table[1,2]
FP = confusion_table[2,1]
sensitivity = round(TP / (TP + FN), 2)
return(sensitivity)
}
get_specificity <- function(df, predicted, actual){
confusion_table = table(df[[predicted]],df[[actual]])
TP = confusion_table[2,2]
TN = confusion_table[1,1]
FN = confusion_table[1,2]
FP = confusion_table[2,1]
specificity = round(TN / (TN + FP), 2)
return(specificity)
}
get_f1_score <- function(df, predicted, actual){
confusion_table = table(df[[predicted]],df[[actual]])
TP = confusion_table[2,2]
TN = confusion_table[1,1]
FN = confusion_table[1,2]
FP = confusion_table[2,1]
precision = round(TP / (TP + FP), 2)
sensitivity = round(TP / (TP + FN), 2)
f1_score = round((2 * precision * sensitivity) / (precision + sensitivity), 2)
return(f1_score)
}
caret::confusionMatrix(factor(Base_fiscalia_v13c_dic_2022_3$tot_off_top_bin), factor(Base_fiscalia_v13c_dic_2022_3$tot_bin))
score = data.frame(accuracy=get_accuracy(Base_fiscalia_v13c_dic_2022_3, 'tot_off_top_bin', 'tot_bin'),
classification_error_rate=get_classification_error_rate(Base_fiscalia_v13c_dic_2022_3, 'tot_off_top_bin', 'tot_bin'),
precision=get_precision(Base_fiscalia_v13c_dic_2022_3, 'tot_off_top_bin', 'tot_bin'),
sensitivity=get_sensitivity(Base_fiscalia_v13c_dic_2022_3, 'tot_off_top_bin', 'tot_bin'),
specificity=get_specificity(Base_fiscalia_v13c_dic_2022_3, 'tot_off_top_bin', 'tot_bin'),
f1_score=get_f1_score(Base_fiscalia_v13c_dic_2022_3, 'tot_off_top_bin', 'tot_bin'))
#We can also combine precision and recall into an F1 score. This is the harmonic mean of precision and recall.
knitr::kable(score, caption= "Table 1. Properties")# %>% kableExtra::kable_classic()
For ROC curves, we dichotomized into committing a crime or not against reporting or not (*_bin). Posteriorly, we calculated confidence intervals using bootstrap with 500 replications.
#Acquisitive SUD Violent Other
suppressMessages(suppressWarnings(library(ROCit)))
## Warning: package 'ROCit' was built under R version 3.5.2
message("Error in convertclass(class, reference = negref) : class must have exactly two unique values)")
cat("::::: {.panelset}", "\n")
cat("::: {.panel}", "\n", "[Total]{.panel-name}", "\n")
#FPR: 1-especificidad
#TPR: Sensibilidad
cat(":::", "\n")
cat("::: {.panel}", "\n", "[Acquisitive]{.panel-name}", "\n")
cat(":::", "\n")
cat("::: {.panel}", "\n", "[SUD]{.panel-name}", "\n")
cat(":::", "\n")
cat("::: {.panel}", "\n", "[Violent]{.panel-name}", "\n")
cat(":::", "\n")
cat("::: {.panel}", "\n", "[Other]{.panel-name}", "\n")
cat(":::", "\n")
cat(":::::", "\n")
a_tab1_lab_aft_d<- paste0('Original C1 Dataset \n(p= ', formatC(CONS_C1%>% dplyr::distinct(HASH_KEY)%>% nrow(), format='f', big.mark=',', digits=0), ';\np= ', formatC(nrow(CONS_C1), format='f', big.mark=',', digits=0),')')
a_tab2_lab_aft_d<- paste0('•Remove duplicated entries\\\\\\l•Overlapping treatments of patients\\\\\\l•Intermediate treatment events (continuous referrals) \\\\\\l')
a_tab3_lab_aft_d<- paste0(' C1 Dataset \n(p= ', formatC(CONS_C1_df_dup_SEP_2020%>% dplyr::distinct(hash_key)%>% nrow(), format='f', big.mark=',', digits=0), ';\nn= ', formatC(nrow(CONS_C1_df_dup_SEP_2020), format='f', big.mark=',', digits=0),')')
a_tab4_lab_aft_d<- paste0('Original Prosecutors Office\n(p= ',Base_fiscalia_v2%>% dplyr::distinct(rut_enc_saf)%>% nrow() %>% format(big.mark=','), ';\nCauses= ',Base_fiscalia_v2%>% dplyr::distinct(ruc)%>% nrow() %>% format(big.mark=','), ';\nRel.=',Base_fiscalia_v2%>%dplyr::distinct(idrelacion)%>%nrow()%>%format(big.mark=','), ';\nRUC_Vic_Imp=',Base_fiscalia_v2%>%dplyr::mutate(rel=paste0(ruc,"_",idsujeto_victima,"_",idsujeto_imputado,"_","iddelito"))%>%dplyr::distinct(rel)%>%nrow()%>%format(big.mark=','), ';\nn= ',format(nrow(Base_fiscalia_v2),big.mark=","),')')
#crimes committed after study follow-up
a_tab5_lab_aft_d1<-
paste0("(p= ",format(nrow(unique(subset(Base_fiscalia_v2,fec_comision_simple>as.Date("2019-11-13"),"rut_enc_saf"))),big.mark=","),"; RUCs= ", format(nrow(unique(subset(Base_fiscalia_v2,fec_comision_simple>as.Date("2019-11-13"),"ruc"))), big.mark=","),";n= ", format(nrow(subset(Base_fiscalia_v2,fec_comision_simple>as.Date("2019-11-13"),"rut_enc_saf")), big.mark=","),")")
#erase entries with missing values in fec_comision_simple y termino_relacion_simple
leftovers_Base_fiscalia_v3<-
Base_fiscalia_v3 %>%
dplyr::left_join(after_imp_Base_fiscalia_v3_db[,c("rut_enc_saf","imp_birth_date","flowch_age")], by="rut_enc_saf") %>%
dplyr::rename("obs"="flowch_age") %>%
dplyr::mutate(imp_birth_date=dplyr::case_when(!is.na(imp_birth_date)~imp_birth_date,T~fec_nacimiento_simple))%>% dplyr::mutate(edad_comision_imp=as.numeric(fec_comision_simple-imp_birth_date)/365.25) %>% dplyr::mutate(edad_ter_rel_imp=as.numeric(termino_relacion_simple-imp_birth_date)/365.25) %>%
#arrange the rut from the first date of comission of a crime, but we are not detecting if he/she is the victim or not
dplyr::arrange(rut_enc_saf, edad_comision_imp) %>% #566884
dplyr::filter(dplyr::case_when(!is.na(edad_comision_imp)~T,T~F)) %>% #566644
dplyr::filter(imp_birth_date=="1900-01-01"|is.na(imp_birth_date))
a_tab5_lab_aft_d12<-
paste0("(p= ",format(nrow(dplyr::distinct(leftovers_Base_fiscalia_v3,rut_enc_saf)),big.mark=","),"; RUCs= ",
format(nrow(dplyr::distinct(leftovers_Base_fiscalia_v3,ruc)), big.mark=","),";n= ",
format(nrow(leftovers_Base_fiscalia_v3), big.mark=","),")")
#minor to 14 years old
a_tab5_lab_aft_d13<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v4,edad_comision_imp<14),rut_enc_saf)),big.mark=","),"; RUCs= ", format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v4,edad_comision_imp<14),ruc)), big.mark=","),";n= ", format(nrow(dplyr::filter(Base_fiscalia_v4,edad_comision_imp<14)), big.mark=","),")")
#Remove duplicated entries
a_tab5_lab_aft_d2<-
paste0("(p= ",format(length(unique(eliminated_duplicates$rut_enc_saf)),big.mark=","),"; RUCs= ",
format(length(unique(eliminated_duplicates$ruc)), big.mark=","),";n= ",
format(nrow(eliminated_duplicates), big.mark=","),")")
#before 2010
a_tab5_lab_aft_d3<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v7,fec_comision_simple<"2010-01-01"),rut_enc_saf)),big.mark=","),"; RUCs= ",
format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v7,fec_comision_simple<"2010-01-01"),ruc)), big.mark=","),";n= ",
format(nrow(dplyr::filter(Base_fiscalia_v7,fec_comision_simple<"2010-01-01")), big.mark=","),")")
#remove administrative annulment
a_tab5_lab_aft_d4<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v7,agrupa_terminos=="ANULACI¿N ADMINISTRATIVA"),rut_enc_saf)),big.mark=","),"; RUCs= ",
format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v7,agrupa_terminos=="ANULACI¿N ADMINISTRATIVA"),ruc)), big.mark=","),";n= ",
format(nrow(dplyr::filter(Base_fiscalia_v7,agrupa_terminos=="ANULACI¿N ADMINISTRATIVA")), big.mark=","),")")
#remove grouped to another case
a_tab5_lab_aft_d5<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v7,agrupa_terminos=="AGRUPACI¿N A OTRO CASO"),rut_enc_saf)),big.mark=","),"; RUCs= ",
format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v7,agrupa_terminos=="AGRUPACI¿N A OTRO CASO"),ruc)), big.mark=","),";n= ",
format(nrow(dplyr::filter(Base_fiscalia_v7,agrupa_terminos=="AGRUPACI¿N A OTRO CASO")), big.mark=","),")")
a_tab5_lab_aft_d<- paste0('•Filter crimes committed after study follow-up period',a_tab5_lab_aft_d1,'\\\\\\l•Remove duplicated entries',a_tab5_lab_aft_d2,'\\\\\\l•Correct dates (birth, comission of crime, end of judicial proceedings), missing nationality and sex\\\\\\l•Erase entries with missing values in comission of crime, end of judicial proceedings',a_tab5_lab_aft_d12,'\\\\\\l•Erase entries with values in comission of crime when minor to 14 years old after imputation',a_tab5_lab_aft_d13,'\\\\\\l•Filter crimes committed before study follow-up',a_tab5_lab_aft_d3,'\\\\\\l•Filter records with cause of end of the proceedings= administrative annulment',a_tab5_lab_aft_d4,'\\\\\\l•Filter records with cause of end of the proceedings= grouped to another case',a_tab5_lab_aft_d5,'\\\\\\l')
a_tab6_lab_aft_d<- paste0("O.P. Dataset \n(p= ", dplyr::distinct(Base_fiscalia_v8, rut_enc_saf)%>% nrow()%>% formatC(big.mark = ","), ";\nn= ",formatC(nrow(Base_fiscalia_v8),big.mark = ","),")")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#DISCARD not coded as an offender
a_tab7_lab_aft_d1<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v8,!grepl("SI",encontrado_como_imputado)),rut_enc_saf)),big.mark=","),"; RUCs= ",
format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v8,!grepl("SI",encontrado_como_imputado)),ruc)), big.mark=","),";n= ",
format(nrow(dplyr::filter(Base_fiscalia_v8,!grepl("SI",encontrado_como_imputado))), big.mark=","),")")
#FILTER IF THE PATIENT RECIEVES FOR THE RELATIONSHIP AMONG THOSE THAT WERE OFFENDERS
# end of proceeding === SIMILAR TO Base_fiscalia_v9
a_tab7_lab_aft_d2<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v8, grepl("SI",encontrado_como_imputado)) %>%
dplyr::mutate(filter=dplyr::case_when(grepl("REPARATORIO|CONDICIONAL",toupper(agrupa_terminos)) & is.na(gls_mottermino)~1, grepl("REPARATORIO|SENTENCIA DEFINITIVA CONDENATORIA|240|MONIT", toupper(agrupa_terminos), ignore.case=F)~2, T~0))%>% dplyr::filter(filter==0),rut_enc_saf)),big.mark=","),"; RUCs= ",
format(nrow(dplyr::distinct(dplyr::filter(Base_fiscalia_v8, grepl("SI",encontrado_como_imputado)) %>%
dplyr::mutate(filter=dplyr::case_when(grepl("REPARATORIO|CONDICIONAL",toupper(agrupa_terminos)) & is.na(gls_mottermino)~1, grepl("REPARATORIO|SENTENCIA DEFINITIVA CONDENATORIA|240|MONIT", toupper(agrupa_terminos), ignore.case=F)~2, T~0))%>% dplyr::filter(filter==0),ruc)), big.mark=","),";n= ",
format(nrow(dplyr::filter(Base_fiscalia_v8, grepl("SI",encontrado_como_imputado)) %>%
dplyr::mutate(filter=dplyr::case_when(grepl("REPARATORIO|CONDICIONAL",toupper(agrupa_terminos)) & is.na(gls_mottermino)~1, grepl("REPARATORIO|SENTENCIA DEFINITIVA CONDENATORIA|240|MONIT", toupper(agrupa_terminos), ignore.case=F)~2, T~0))%>% dplyr::filter(filter==0)), big.mark=","),")")
CONS_TOP_2022_anti<-
# 107307
CONS_TOP%>%
dplyr::left_join(subset(dplyr::mutate(dplyr::group_by(Base_fiscalia_v11b_dic_2022, hash_key), hash_rn=row_number())%>% ungroup(), hash_rn==1), by= c("HASH_KEY" = "hash_key"))%>%
dplyr::mutate(fech_ap_top_num= as.numeric(as.Date(str_sub(as.character(lubridate::parse_date_time(Fecha.Aplicación.TOP, c("%Y-%m-%d"),exact=T)),1,10))))%>% #No parse failures
dplyr::select(HASH_KEY, fech_ap_top_num, Fecha.Aplicación.TOP, dateofbirth_imp, Hurto, Robo, Venta.Drogas, Riña, Total.VIF, Otro) %>%
dplyr::filter(!is.na(HASH_KEY)) %>%
dplyr::mutate_at(vars("Hurto", "Robo", "Venta.Drogas", "Riña", "Otro"), ~ifelse(.=="S",1,0)) %>%
dplyr::mutate(Total.VIF= ifelse(Total.VIF>0,1,0))%>%
dplyr::mutate(tot_off_top = base::rowSums(dplyr::select(.,c(Hurto, Robo, Venta.Drogas, Riña, Total.VIF, Otro)), na.rm = T)) %>%
dplyr::mutate(dateofbirth_imp_num= as.numeric(dateofbirth_imp),
fech_ap_top= lubridate::parse_date_time(Fecha.Aplicación.TOP, c("%Y-%m-%d"),exact=T),
edad_a_ap_top_num= lubridate::time_length(lubridate::interval(dateofbirth_imp, fech_ap_top),unit="years"),
edad_b_ap_top_num= (fech_ap_top_num-dateofbirth_imp_num)/365.25,
edad_a_ap_top_num_lim= edad_a_ap_top_num-(1/12),
edad_b_ap_top_num_lim= edad_b_ap_top_num-(1/12)) %>%
dplyr::select(-dateofbirth_imp, -dateofbirth_imp_num) %>%
dplyr::filter(!is.na(edad_a_ap_top_num)) %>%
dplyr::group_by(HASH_KEY, edad_a_ap_top_num) %>%
dplyr::slice(-1) %>%
dplyr::ungroup()
a_tab7_lab_aft_d25<-
paste0("(p= ",format(length(unique(CONS_TOP_2022_anti$HASH_KEY)),big.mark=","),"; n= ",
format(length(CONS_TOP_2022_anti$HASH_KEY), big.mark=","),")")
a_tab7_lab_aft_d3<-
paste0("(p= ",format(nrow(dplyr::distinct(dplyr::anti_join(CONS_C1_df_dup_SEP_2020, CONS_C1_df_dup_SEP_2020_22_d, by=c("hash_key","dup")),hash_key)),big.mark=","),"; n= ",
format(nrow(dplyr::anti_join(CONS_C1_df_dup_SEP_2020, CONS_C1_df_dup_SEP_2020_22_d, by=c("hash_key","dup"))), big.mark=","),")")
a_tab8_lab_aft_d <- paste0('Offenses previous\nto the first admission\n',"(p= ",format(length(unique(Base_fiscalia_v10b_dic_2022[,"hash_key"])),big.mark=","),"; RUCs= ",
format(length(unique(Base_fiscalia_v10b_dic_2022[,"caseid"])), big.mark=","),";\nn= ",
format(length(Base_fiscalia_v10b_dic_2022[,"caseid"]), big.mark=","),")")
a_tab9_lab_aft<- paste0('•Discard observations coded as victims rather than ofenders ',tab7_lab_aft_d1,'\\\\\\l•Discard observations depending on the values of the end of the proceedings among offenders ',tab7_lab_aft_d2,'\\\\\\l•Discard TOP applications with duplicated dates and user ',a_tab7_lab_aft_d25,'\\\\\\l•Get the first treatment of each user ',a_tab7_lab_aft_d3,'\\\\\\l')
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#// ON x.hash_key == y.id AND x.edad_al_egres_imp > y.age_offending_imp AND x.dup = 1"
library(DiagrammeR)
plot_merge_flowchart_excercise<-
grViz("digraph flowchart {
fontname='Comic Sans MS'
# node definitions with substituted label text
node [shape = rectangle,fontsize = 9]
tab1 [label = '@@1']
blank [label = '', width = 0.0001, height = 0.0001]
tab2 [label = '@@2',fontsize = 7]
tab3 [label = '@@3']
tab4 [label = '@@4',fontsize = 8]
blank2 [label = '', width = 0.0001, height = 0.0001]
tab5 [label = '@@5',fontsize = 7]
tab6 [label= '@@6']
blank3 [label = '', width = 0.0001, height = 0.0001]
blank4 [label = '', width = 0.0001, height = 0.0001]
blank5 [label = '', width = 0.0001, height = 0.0001]
blank6 [label = '', width = 0.0001, height = 0.0001]
tab7 [label = '@@7',fontsize = 7]
tab8 [label= '@@8']
# edge definitions with the node IDs
rankdir='TB'; rank= same; tab1 -> blank [arrowhead = none,label=' Data wrangling and normalization process',fontsize = 8];
rankdir='TB'; rank= same; tab1; tab3;
blank -> tab2;
subgraph {
rank = same; tab2; blank;
}
rankdir='TB'; rank= same; blank -> tab3;
tab4 -> blank2 [arrowhead = none,label=' Data wrangling and normalization process',fontsize = 8];
blank2 -> tab5
blank2 -> tab6
blank4 -> blank6 [arrowhead= none, label=' ⟖']
blank6 -> tab7
subgraph {
rank = same; blank6; tab7;
}
# bring_db1_a
blank6 -> tab8 [label= ' Merge records where discharge age is greater than age of offenses', fontsize = 8]
subgraph {
rank = same; tab5; blank2;
}
subgraph {
rank= same; tab3 -> blank3 -> blank4 -> blank5 -> tab6 [arrowhead= none]
}
}
subgraph {
rank = same; tab3; tab6;
}
subgraph {
rank = same; tab1; tab4;
}
subgraph {
rank = same; tab2; tab5;
}
subgraph {
rank = same; tab1; tab3;
rankdir=TB
rank=same
}
subgraph {
rank = same; tab4; tab6;
rankdir=TB
rank=same
}
[1]: a_tab1_lab_aft_d
[2]: a_tab2_lab_aft_d
[3]: a_tab3_lab_aft_d
[4]: a_tab4_lab_aft_d
[5]: a_tab5_lab_aft_d
[6]: a_tab6_lab_aft_d
[7]: a_tab9_lab_aft
[8]: a_tab8_lab_aft_d
", width = 1200,
height = 900)
DPI = 1200
WidthCM = 11
HeightCM = 8
plot_merge_flowchart_excerciselibrary(DiagrammeRsvg)
plot_merge_flowchart_excercise %>%
export_svg %>% charToRaw %>% rsvg::rsvg_pdf("./_figs/_flowchart_comm_DAD.pdf")
plot_merge_flowchart_excercise %>% DiagrammeRsvg::export_svg()%>%charToRaw %>% rsvg::rsvg(width = WidthCM *(DPI/2.54), height = HeightCM *(DPI/2.54)) %>% png::writePNG("./_figs/_flowchart_comm_DAD.png")
htmlwidgets::saveWidget(plot_merge_flowchart_excercise, "./_figs/_flowchart_comm_DAD.html")
webshot::webshot("./_figs/_flowchart_comm_DAD.html", "./_figs/_flowchart_comm_DAD.png",vwidth = 1200, vheight = 900, zoom = 3)
webshot::webshot("./_figs/_flowchart_comm_DAD.html", "./_figs/_flowchart_comm_DAD_alt.pdf")
a_tab10_lab_aft_d<- paste0("TOP Dataset \n(p= ", dplyr::distinct(CONS_TOP, HASH_KEY)%>% nrow()%>% formatC(big.mark = ","),";\nn= ",formatC(nrow(CONS_TOP),big.mark = ","),")")
#bring_db3
a_tab11_lab_aft_d1<-
paste0( '(p= ',format(nrow(dplyr::distinct(dplyr::group_by(Base_fiscalia_v9, id, caseid, end_type, fec_comision_simple, crime_code_c) %>% dplyr::slice(-1) %>% dplyr::ungroup(),id)), big.mark=","), '; RUCs= ', format(nrow(dplyr::distinct(dplyr::group_by(Base_fiscalia_v9, id, caseid, end_type, fec_comision_simple, crime_code_c) %>% dplyr::slice(-1) %>% dplyr::ungroup(),caseid)), big.mark=","), '; n= ', format(nrow(dplyr::group_by(Base_fiscalia_v9, id, caseid, end_type, fec_comision_simple, crime_code_c) %>% dplyr::slice(-1) %>% dplyr::ungroup()), big.mark=",") , ')'
)
message(
paste0("Patients in PO records with unique combination of ID, RUC, end type and date of comission of the offense and offense ", a_tab11_lab_aft_d1)
)
a_tab11_lab_aft_d<- paste0('•Count events such as shoplifting, theft, domestic violence, drug selling, fights, or related behavior in the last four weeks','\\\\\\l',
'•Get PO data with more than one record with the same case ID, sentence, date of offense and type of crime ', a_tab11_lab_aft_d1,'\\\\\\l')
a_tab12_lab_aft_d<-
paste0('Merged database\n(p= ',format(nrow(dplyr::distinct(Base_fiscalia_v13c_dic_2022_3,HASH_KEY)), big.mark=","), '; n= ', format(nrow(Base_fiscalia_v13c_dic_2022_3), big.mark=",") , ')'
)
plot_merge_flowchart_excercise2<-
grViz("digraph flowchart {
fontname='Comic Sans MS'
# edge definitions with the node IDs
# node definitions with substituted label text
node [shape = rectangle, fontsize = 9]
tab1 [label = '@@1', fontsize = 7]
tab2 [label = '@@2', fontsize = 7]
blank [label = '', width = 0.0001, height = 0.0001]
blanka [label = '', width = 0.0001, height = 0.0001]
blankb [label = '', width = 0.0001, height = 0.0001]
# bring_db2
subgraph {
rank = same; tab1 -> blanka -> blank -> blankb -> tab2 [arrowhead= none]
}
tab3 [label = '@@3',fontsize = 6]
blank2 [label = '', width = 0.0001, height = 0.0001]
blank -> blank2 [arrowhead= none, label=' ⟖']
subgraph {
rank = same; blank2 -> tab3
}
tab4 [label = '@@4',fontsize = 9]
blank2 -> tab4 [label=' Select records of offenses committed before the application of TOP but\n at least the last month previous to the application of TOP \\\\\\l',fontsize = 7];
}
[1]: a_tab8_lab_aft_d
[2]: a_tab10_lab_aft_d
[3]: a_tab11_lab_aft_d
[4]: a_tab12_lab_aft_d
[5]: ''
[6]: ''
[7]: ''
[8]: ''
", width = 1200,
height = 900)
plot_merge_flowchart_excercise2
plot_merge_flowchart_excercise2 %>%
export_svg %>% charToRaw %>% rsvg::rsvg_pdf("./_figs/_flowchart2_comm_DAD.pdf")
plot_merge_flowchart_excercise2 %>% DiagrammeRsvg::export_svg()%>%charToRaw %>% rsvg::rsvg(width = WidthCM *(DPI/2.54), height = HeightCM *(DPI/2.54)) %>% png::writePNG("./_figs/_flowchart2_comm_DAD.png")
htmlwidgets::saveWidget(plot_merge_flowchart_excercise2, "./_figs/_flowchart2_comm_DAD.html")
webshot::webshot("./_figs/_flowchart2_comm_DAD.html", "./_figs/_flowchart2_comm_DAD.png",vwidth = 1200, vheight = 900, zoom = 3)
webshot::webshot("./_figs/_flowchart2_comm_DAD.html", "./_figs/_flowchart2_comm_DAD_alt.pdf")
message(Sys.getenv("R_LIBS_USER"))
Sys.Date()
[1] "2023-02-18"
if (grepl("CISS Fondecyt",rstudioapi::getSourceEditorContext()$path)==T){
save.image("C:/Users/CISS Fondecyt/Mi unidad/Alvacast/SISTRAT 2022 (github)/14_alt.RData")
} else if (grepl("andre",rstudioapi::getSourceEditorContext()$path)==T){
save.image("C:/Users/andre/Desktop/SUD_CL/14_alt.RData")
} else if (grepl("E:",rstudioapi::getSourceEditorContext()$path)==T){
save.image("E:/Mi unidad/Alvacast/SISTRAT 2022 (github)/14_alt.RData")
} else {
save.image(paste0(sub("2019","2022",sub("SUD_CL","",path)),"14_alt.RData"))
}
#load(paste0(dirname(rstudioapi::getSourceEditorContext()$path),"/14_alt.RData"))
sesion_info <- devtools::session_info()
dplyr::select(
tibble::as_tibble(sesion_info$packages),
c(package, loadedversion, source)
) %>%
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Variable' = 2, 'Percentage'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('Packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '50%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",#;
"}")))
#put name of the file
file<- "fiscalia_ags_jan_2023_match_SENDA.dta"
export_lab_stata_merge<-
tibble::rownames_to_column(data.frame(Hmisc::label(dplyr::select(Base_fiscalia_v13, all_of(c("hash_key", cont_vars_desc, cat_vars_desc, cat_vars_desc_off, vars_desc, "fech_ing_num_1", "fech_egres_imp", "cut_com_del", "cut_fec_nac","offender_d", "comuna_residencia_cod_rec", "porc_pobr", "Clasificación"))))))%>% data.frame() %>%
dplyr::rename("code" = !!names(.[1]), "label" = !!names(.[2]))%>% data.frame()%>%
dplyr::mutate(first= "cap noi label variable")%>%
dplyr::mutate(final= paste0(first, " ",code,' "',label,'"'))%>%
dplyr::select(-code,-label,-first)%>%
dplyr::rename("*clear all"="final") %>%
rbind(paste0('cap noi save "', gsub('/', '\\', path, fixed=T),'\\',file,'", replace'))%>%
rbind(paste0('cap noi save "', gsub('/', '\\', path, fixed=T),'\\',file,'", replace'))
rbind(paste0('cap noi use "', gsub('/', '\\', path, fixed=T),'\\',file,'", clear'),export_lab_stata_merge) %>% knitr::kable("markdown")
write.table(rbind(paste0('cap noi use "', gsub('/', '\\', path, fixed=T),'\\',file,'", clear'),export_lab_stata_merge), file = paste0(path,"/_label_var_to_stata_jan2023.do"), sep = "",row.names = FALSE, quote = FALSE, fileEncoding="UTF-8")
*should be in the same folder of the .Rmd to work
cap noi do _label_var_to_stata_jan2023Error in running command st